##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.1.6
## ✔ ggplot2 4.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'janitor'
##
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
##
##
## Loading required package: lattice
##
##
## Attaching package: 'caret'
##
##
## The following object is masked from 'package:purrr':
##
## lift
##
##
##
## Attaching package: 'MASS'
##
##
## The following object is masked from 'package:dplyr':
##
## select
##
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loaded glmnet 4.1-10
##
## Loading required package: mvtnorm
##
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:purrr':
##
## some
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## Type 'citation("pROC")' for a citation.
##
##
## Attaching package: 'pROC'
##
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
##
##
## randomForest 4.7-1.2
##
## Type rfNews() to see new features/changes/bug fixes.
##
##
## Attaching package: 'randomForest'
##
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
##
## The following object is masked from 'package:dplyr':
##
## combine
# Load the marketing campaign data (tab-delimited despite the .csv
# extension) and normalise all column names to snake_case.
df <- "data/marketing_campaign.csv" %>%
  readr::read_tsv() %>%
  janitor::clean_names()
## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Content
AcceptedCmp1 - 1 if customer accepted the offer in the 1st campaign, 0 otherwise AcceptedCmp2 - 1 if customer accepted the offer in the 2nd campaign, 0 otherwise AcceptedCmp3 - 1 if customer accepted the offer in the 3rd campaign, 0 otherwise AcceptedCmp4 - 1 if customer accepted the offer in the 4th campaign, 0 otherwise AcceptedCmp5 - 1 if customer accepted the offer in the 5th campaign, 0 otherwise Response (target) - 1 if customer accepted the offer in the last campaign, 0 otherwise Complain - 1 if customer complained in the last 2 years DtCustomer - date of customer’s enrolment with the company Education - customer’s level of education Marital - customer’s marital status Kidhome - number of small children in customer’s household Teenhome - number of teenagers in customer’s household Income - customer’s yearly household income MntFishProducts - amount spent on fish products in the last 2 years MntMeatProducts - amount spent on meat products in the last 2 years MntFruits - amount spent on fruits products in the last 2 years MntSweetProducts - amount spent on sweet products in the last 2 years MntWines - amount spent on wine products in the last 2 years MntGoldProds - amount spent on gold products in the last 2 years NumDealsPurchases - number of purchases made with discount NumCatalogPurchases - number of purchases made using catalogue NumStorePurchases - number of purchases made directly in stores NumWebPurchases - number of purchases made through company’s web site NumWebVisitsMonth - number of visits to company’s web site in the last month Recency - number of days since the last purchase
glimpse(df)
## Rows: 2,240
## Columns: 29
## $ id <dbl> 5524, 2174, 4141, 6182, 5324, 7446, 965, 6177, 4…
## $ year_birth <dbl> 1957, 1954, 1965, 1984, 1981, 1967, 1971, 1985, …
## $ education <chr> "Graduation", "Graduation", "Graduation", "Gradu…
## $ marital_status <chr> "Single", "Single", "Together", "Together", "Mar…
## $ income <dbl> 58138, 46344, 71613, 26646, 58293, 62513, 55635,…
## $ kidhome <dbl> 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, …
## $ teenhome <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ dt_customer <chr> "04-09-2012", "08-03-2014", "21-08-2013", "10-02…
## $ recency <dbl> 58, 38, 26, 26, 94, 16, 34, 32, 19, 68, 11, 59, …
## $ mnt_wines <dbl> 635, 11, 426, 11, 173, 520, 235, 76, 14, 28, 5, …
## $ mnt_fruits <dbl> 88, 1, 49, 4, 43, 42, 65, 10, 0, 0, 5, 16, 61, 2…
## $ mnt_meat_products <dbl> 546, 6, 127, 20, 118, 98, 164, 56, 24, 6, 6, 11,…
## $ mnt_fish_products <dbl> 172, 2, 111, 10, 46, 0, 50, 3, 3, 1, 0, 11, 225,…
## $ mnt_sweet_products <dbl> 88, 1, 21, 3, 27, 42, 49, 1, 3, 1, 2, 1, 112, 5,…
## $ mnt_gold_prods <dbl> 88, 6, 42, 5, 15, 14, 27, 23, 2, 13, 1, 16, 30, …
## $ num_deals_purchases <dbl> 3, 2, 1, 2, 5, 2, 4, 2, 1, 1, 1, 1, 1, 3, 1, 1, …
## $ num_web_purchases <dbl> 8, 1, 8, 2, 5, 6, 7, 4, 3, 1, 1, 2, 3, 6, 1, 7, …
## $ num_catalog_purchases <dbl> 10, 1, 2, 0, 3, 4, 3, 0, 0, 0, 0, 0, 4, 1, 0, 6,…
## $ num_store_purchases <dbl> 4, 2, 10, 4, 6, 10, 7, 4, 2, 0, 2, 3, 8, 5, 3, 1…
## $ num_web_visits_month <dbl> 7, 5, 4, 6, 5, 6, 6, 8, 9, 20, 7, 8, 2, 6, 8, 3,…
## $ accepted_cmp3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ accepted_cmp4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ accepted_cmp5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ accepted_cmp1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ accepted_cmp2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ complain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ z_cost_contact <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ z_revenue <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, …
## $ response <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, …
dim(df)
## [1] 2240 29
Only the “Income” variable has missing values. Since there are only 24 of them, the affected rows were simply removed.
skimr::skim(df)
| Name | df |
| Number of rows | 2240 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| education | 0 | 1 | 3 | 10 | 0 | 5 | 0 |
| marital_status | 0 | 1 | 4 | 8 | 0 | 8 | 0 |
| dt_customer | 0 | 1 | 10 | 10 | 0 | 663 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 5592.16 | 3246.66 | 0 | 2828.25 | 5458.5 | 8427.75 | 11191 | ▇▇▇▇▇ |
| year_birth | 0 | 1.00 | 1968.81 | 11.98 | 1893 | 1959.00 | 1970.0 | 1977.00 | 1996 | ▁▁▂▇▅ |
| income | 24 | 0.99 | 52247.25 | 25173.08 | 1730 | 35303.00 | 51381.5 | 68522.00 | 666666 | ▇▁▁▁▁ |
| kidhome | 0 | 1.00 | 0.44 | 0.54 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▆▁▁ |
| teenhome | 0 | 1.00 | 0.51 | 0.54 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▇▁▁ |
| recency | 0 | 1.00 | 49.11 | 28.96 | 0 | 24.00 | 49.0 | 74.00 | 99 | ▇▇▇▇▇ |
| mnt_wines | 0 | 1.00 | 303.94 | 336.60 | 0 | 23.75 | 173.5 | 504.25 | 1493 | ▇▂▂▁▁ |
| mnt_fruits | 0 | 1.00 | 26.30 | 39.77 | 0 | 1.00 | 8.0 | 33.00 | 199 | ▇▁▁▁▁ |
| mnt_meat_products | 0 | 1.00 | 166.95 | 225.72 | 0 | 16.00 | 67.0 | 232.00 | 1725 | ▇▁▁▁▁ |
| mnt_fish_products | 0 | 1.00 | 37.53 | 54.63 | 0 | 3.00 | 12.0 | 50.00 | 259 | ▇▁▁▁▁ |
| mnt_sweet_products | 0 | 1.00 | 27.06 | 41.28 | 0 | 1.00 | 8.0 | 33.00 | 263 | ▇▁▁▁▁ |
| mnt_gold_prods | 0 | 1.00 | 44.02 | 52.17 | 0 | 9.00 | 24.0 | 56.00 | 362 | ▇▁▁▁▁ |
| num_deals_purchases | 0 | 1.00 | 2.33 | 1.93 | 0 | 1.00 | 2.0 | 3.00 | 15 | ▇▂▁▁▁ |
| num_web_purchases | 0 | 1.00 | 4.08 | 2.78 | 0 | 2.00 | 4.0 | 6.00 | 27 | ▇▃▁▁▁ |
| num_catalog_purchases | 0 | 1.00 | 2.66 | 2.92 | 0 | 0.00 | 2.0 | 4.00 | 28 | ▇▂▁▁▁ |
| num_store_purchases | 0 | 1.00 | 5.79 | 3.25 | 0 | 3.00 | 5.0 | 8.00 | 13 | ▂▇▂▃▂ |
| num_web_visits_month | 0 | 1.00 | 5.32 | 2.43 | 0 | 3.00 | 6.0 | 7.00 | 20 | ▅▇▁▁▁ |
| accepted_cmp3 | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| accepted_cmp4 | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| accepted_cmp5 | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| accepted_cmp1 | 0 | 1.00 | 0.06 | 0.25 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| accepted_cmp2 | 0 | 1.00 | 0.01 | 0.11 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| complain | 0 | 1.00 | 0.01 | 0.10 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| z_cost_contact | 0 | 1.00 | 3.00 | 0.00 | 3 | 3.00 | 3.0 | 3.00 | 3 | ▁▁▇▁▁ |
| z_revenue | 0 | 1.00 | 11.00 | 0.00 | 11 | 11.00 | 11.0 | 11.00 | 11 | ▁▁▇▁▁ |
| response | 0 | 1.00 | 0.15 | 0.36 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
# Cast identifier, categorical and binary-indicator columns to factors.
# NOTE(review): df_clean is re-created from the raw `df` a few steps below
# (df_clean <- df %>% dplyr::select(...)), so these conversions and the
# drop_na(income) are effectively discarded; later code re-applies
# as.factor() and removes the NA incomes again via income_log.
df_clean <- df %>%
  mutate(
    across(c(
      id, education, marital_status,
      kidhome, teenhome,
      accepted_cmp1, accepted_cmp2, accepted_cmp3,
      accepted_cmp4, accepted_cmp5, response
    ), as.factor)
  )
# Drop the 24 rows with missing income.
df_clean <- df_clean %>% tidyr::drop_na(income)
# Confirm no missing values remain.
sum(is.na(df_clean))
## [1] 0
The variables “z_cost_contact” and “z_revenue” have a constant value; since they add no information, they are eliminated from the analysis.
# Identify columns with a single distinct value (constants carry no signal).
df %>%
  summarise(across(everything(), n_distinct)) %>%
  pivot_longer(everything(), names_to = "var", values_to = "n_unique") %>%
  filter(n_unique == 1)
## # A tibble: 2 × 2
## var n_unique
## <chr> <int>
## 1 z_cost_contact 1
## 2 z_revenue 1
# Drop the two constant columns.
# NOTE(review): this restarts from the raw `df`, not the df_clean built
# above, silently discarding the earlier factor conversion and
# drop_na(income) — confirm this is intentional.
df_clean <- df %>% dplyr::select(-z_cost_contact, -z_revenue)
Two variables with almost zero variance were found and therefore eliminated.
# Flag near-zero-variance predictors with caret: accepted_cmp2 and
# complain are almost constant (freqRatio far above the default cutoff).
nzv <- nearZeroVar(df_clean, saveMetrics = TRUE)
nzv[nzv$zeroVar | nzv$nzv, ]
## freqRatio percentUnique zeroVar nzv
## accepted_cmp2 73.66667 0.08928571 FALSE TRUE
## complain 105.66667 0.08928571 FALSE TRUE
# Keep only columns that are neither zero- nor near-zero-variance.
df_clean <- df_clean[, !nzv$zeroVar & !nzv$nzv]
The “YOLO” and “Absurd” categories of the “marital_status” variable have been removed, as they have a very low presence and distort the analysis; the “Alone” category has been merged into “Single”.
# Tidy marital_status: drop the rare "YOLO"/"Absurd" entries, fold
# "Alone" into "Single", then merge the two small Divorced/Widow groups
# into a single category.
df_clean <- df_clean %>%
  filter(!marital_status %in% c("YOLO", "Absurd")) %>%
  mutate(
    marital_status = marital_status %>%
      fct_recode("Single" = "Alone") %>%
      droplevels() %>%
      fct_collapse("Divorced_Widow" = c("Divorced", "Widow"))
  )
# Remaining category counts.
count(df_clean, marital_status)
## # A tibble: 4 × 2
## marital_status n
## <fct> <int>
## 1 Single 483
## 2 Divorced_Widow 309
## 3 Married 864
## 4 Together 580
The variables relating to children and young people in households have been grouped together under a single variable, “Children.”
# Derived features:
#   children    - total minors in the household (kids + teens)
#   total_spent - sum of the six product-category spend columns
#   age         - customer age relative to 2024 (reference year of report)
# The raw source columns are dropped once the features are built.
df_clean <- df_clean %>%
  mutate(
    children = kidhome + teenhome,
    total_spent = mnt_wines + mnt_fruits + mnt_meat_products +
      mnt_fish_products + mnt_sweet_products + mnt_gold_prods,
    age = 2024 - year_birth
  ) %>%
  dplyr::select(-kidhome, -teenhome, -year_birth)
A logarithmic transformation was applied to the Income feature to correct for the strong positive asymmetry (right-skewness) typical of income distributions.
# Log-transform income to correct its strong right skew, then drop the
# rows where income was missing (log(NA) is NA).
# Fix: the original `na.omit(is.na(income_log))` passed a meaningless
# second argument that na.omit.data.frame never evaluates; the intent —
# dropping rows with a missing income_log — is stated explicitly here.
df_clean <- df_clean %>%
  mutate(income_log = log(income)) %>%
  dplyr::select(-income) %>%
  tidyr::drop_na(income_log)
# Flag customers who accepted at least one of the past campaigns.
# Fix: match the actual lowercase column prefix "accepted_cmp" — the
# original starts_with("Accepted") only worked because tidyselect's
# starts_with() is case-insensitive by default.
df_clean <- df_clean %>%
  mutate(
    accepted_any = if_else(
      rowSums(dplyr::select(., starts_with("accepted_cmp"))) > 0, 1, 0
    ),
    accepted_any = as.factor(accepted_any)
  )
# The individual campaign flags are now redundant (accepted_cmp2 was
# already removed by the near-zero-variance filter).
df_clean <- df_clean %>%
  dplyr::select(-accepted_cmp1, -accepted_cmp3, -accepted_cmp4, -accepted_cmp5)
# Roughly one in five customers accepted at least one past campaign.
table(df_clean$accepted_any)
##
## 0 1
## 1755 457
# Cleaned dataset size after all preprocessing.
dim(df_clean)
## [1] 2212 22
# Bar chart of the target variable: most customers did not accept the
# last campaign offer (strong class imbalance).
ggplot(df_clean, aes(x = factor(response))) +
  geom_bar(width = 0.6, fill = "#4E79A7", color = "white", alpha = 0.9) +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  labs(title = "Distribution Response", x = "Response", y = "Count") +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 16)
  )
# Histogram of log income (before the outlier below is removed).
ggplot(df_clean, aes(x = income_log)) +
  geom_histogram(bins = 60, fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(title = "Distribution Income", x = "Log Income", y = "Frequency") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 16, color = "#333333")
  )
df_clean <- df_clean %>% filter(id != 9432)
# Re-plot log income after removing the outlier.
df_clean %>%
  ggplot(aes(x = income_log)) +
  geom_histogram(bins = 60, fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution Log Income",
    x = "Log Income",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )
# Education levels ordered from most to least frequent, with the count
# printed above each bar (y axis hidden since labels carry the values).
# fct_infreq() replaces the original reorder(..., -length(x)) trick.
ggplot(df_clean, aes(x = fct_infreq(education))) +
  geom_bar(width = 0.6, fill = "#4682B4", color = "white", alpha = 0.8) +
  geom_text(
    stat = "count", aes(label = after_stat(count)),
    vjust = -0.5, color = "#333333"
  ) +
  labs(
    title = "Level of Education",
    x = "Educational Qualification",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 16)
  )
total_spent is a variable created to show the total expenditure incurred by the various instances. Its marked asymmetry is due to the underlying asymmetry of the various variables that comprise it.
# total_spent is right-skewed, inheriting the skew of the mnt_* columns.
ggplot(df_clean, aes(x = total_spent)) +
  geom_histogram(bins = 40, fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(title = "Distribution Total Spent", x = "Total Spent", y = "Frequency") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 16, color = "#333333")
  )
# Number of minors (kids + teens) per household.
ggplot(df_clean, aes(x = factor(children))) +
  geom_bar(fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(title = "Distribution Children", x = "Children", y = "Counts") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 16, color = "#333333")
  )
As can be seen from the graphs, all the “mnt_” variables show strong
asymmetry.
# One histogram per spending ("mnt_") column. Amounts are formatted as
# euro with Italian thousands/decimal separators; plots are named after
# their column and printed one by one.
colonne_mnt <- df_clean %>%
  dplyr::select(starts_with("mnt")) %>%
  names()
lista_grafici <- colonne_mnt %>%
  purrr::set_names() %>%
  map(function(col_name) {
    ggplot(df_clean, aes(x = .data[[col_name]])) +
      geom_histogram(bins = 30, fill = "#4682B4", color = "white", alpha = 0.8) +
      scale_x_continuous(
        labels = label_dollar(prefix = "€ ", big.mark = ".", decimal.mark = ",")
      ) +
      labs(
        title = paste("Distribution", col_name),
        x = "Amount spent",
        y = "Frequency"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(face = "bold", size = 14),
        panel.grid.minor = element_blank()
      )
  })
walk(lista_grafici, print)
Recency is crucial for calculating the churn rate, indicating the number
of days since the last purchase.
# Histogram of days since each customer's last purchase (5-day bins).
df_clean %>%
  ggplot(aes(x = recency)) +
  geom_histogram(binwidth = 5, fill = "#4682B4", color = "white", alpha = 0.8) +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  labs(
    title = "Distribution Recency",
    subtitle = "Days since the customer's last purchase",
    x = "Days since last purchase",
    y = "Number of Customers"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10))
  )
There is a strong correlation (0.64) between income and spending on wine (mnt_wines). High income is positively linked to purchases in stores (0.63) and from catalogs (0.59), and there is a strong negative correlation (-0.61) between income and visits to the website. The strongest correlation (0.74) is between mnt_meat_products and num_catalog_purchases: those who buy a lot of meat tend to do so via catalog. Those who spend on fish also tend to spend on fruit (0.59), meat (0.57), and sweets (0.59).
# Correlation matrix of the numeric predictors. total_spent is excluded
# because it is an exact linear combination of the mnt_* columns, and the
# id column carries no information.
df_corr <- df_clean %>%
  dplyr::select(where(is.numeric)) %>%
  dplyr::select(-total_spent) %>%
  dplyr::select(-matches("ID|id"))
corr_matrix <- cor(df_corr, use = "complete.obs")
# Fix: the original palette c("#B2182B", "white", "#E41A1C") used red for
# BOTH extremes, so negative and positive correlations were visually
# indistinguishable; use a conventional blue-white-red diverging scale.
ggcorrplot(corr_matrix,
  method = "square",
  type = "lower",
  lab = FALSE,
  colors = c("#2166AC", "white", "#B2182B"),
  title = "Correlation Matrix",
  ggtheme = theme_minimal()
) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    panel.grid = element_blank()
  )
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
## Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Long-format ranking of variable pairs by absolute correlation; the
# Var1 < Var2 filter keeps one copy of each symmetric pair and removes
# the diagonal.
classifica_correlazioni <- as.data.frame(corr_matrix) %>%
  rownames_to_column(var = "Var1") %>%
  pivot_longer(cols = -Var1, names_to = "Var2", values_to = "Correlazione") %>%
  filter(Var1 < Var2) %>%
  arrange(desc(abs(Correlazione)))
print("Higher correlations")
## [1] "Higher correlations"
head(classifica_correlazioni, 10)
## # A tibble: 10 × 3
## Var1 Var2 Correlazione
## <chr> <chr> <dbl>
## 1 mnt_meat_products num_catalog_purchases 0.735
## 2 mnt_wines num_store_purchases 0.640
## 3 income_log mnt_wines 0.638
## 4 mnt_wines num_catalog_purchases 0.635
## 5 income_log num_store_purchases 0.625
## 6 income_log num_web_visits_month -0.607
## 7 income_log num_catalog_purchases 0.593
## 8 mnt_fish_products mnt_fruits 0.592
## 9 mnt_fish_products mnt_sweet_products 0.586
## 10 mnt_fish_products mnt_meat_products 0.574
# dt_customer (enrolment date, stored as text) is not used in the models.
df_clean <- df_clean%>%
  dplyr::select(-dt_customer)
# Merge the small "Basic" group into "Graduation".
# NOTE(review): "2n Cycle" is left as its own level — confirm that is
# intentional rather than another candidate for merging.
df_clean <- df_clean %>%
  mutate(
    education = fct_collapse(education,
      "Undergrad_Grad" = c("Basic", "Graduation")
    ))
table(df_clean$education)
##
## 2n Cycle Undergrad_Grad Master PhD
## 200 1168 364 479
Customers who tend to spend more are more likely to accept the latest campaign offer, regardless of the product category.
# Spending per product category, split by response, on a log10 y scale.
# NOTE(review): zero amounts become -Inf under log10, which causes the
# warnings below and silently drops those rows from the boxplots —
# consider log1p() or filtering zeros explicitly.
df_clean %>%
  dplyr::select(response, starts_with("mnt")) %>%
  pivot_longer(cols = -response, names_to = "Product_Category", values_to = "Import") %>%
  mutate(response = as.factor(response)) %>%
  ggplot(aes(x = response, y = Import, fill = response)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  scale_y_log10(labels = label_dollar(prefix = "€", big.mark = ".", decimal.mark = ",")) +
  facet_wrap(~Product_Category, scales = "free_y") +
  scale_fill_manual(
    values = c("0" = "#9575CD", "1" = "#F06292"),
    labels = c("0" = "No", "1" = "Yes")
  ) +
  labs(
    title = "Impact of Expenditure on Response",
    x = "Did you accept the last offer?",
    y = "Amount Spent (Log Scale)"
  ) +
  theme_minimal() +
  theme(legend.position = "top")
## Warning in scale_y_log10(labels = label_dollar(prefix = "€", big.mark = ".", :
## log-10 transformation introduced infinite values.
## Warning: Removed 1261 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Those with more children tend not to respond, perhaps due to income or
time constraints. Doctoral students are the best customers, while those
with a basic education tend not to respond. The variable concerning
marital status seems to show that married or cohabiting people are less
likely to respond.
# Plot, for each level of `var_name` (a column name given as a string),
# the share of customers who responded, stacked to 100%. A dashed white
# line marks the overall response rate as a baseline for comparison.
# Returns the ggplot object.
plot_pct <- function(df, var_name, titolo) {
  baseline_rate <- mean(as.numeric(as.character(df$response)), na.rm = TRUE)
  p <- df %>%
    filter(!is.na(.data[[var_name]])) %>%
    ggplot(aes(x = .data[[var_name]], fill = as.factor(response))) +
    geom_bar(position = "fill", alpha = 0.9) +
    geom_hline(
      yintercept = baseline_rate,
      color = "white", linetype = "dashed", linewidth = 1
    ) +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_manual(
      values = c("0" = "#9575CD", "1" = "#F06292"),
      name = "Response",
      labels = c("No", "Yes")
    ) +
    labs(title = titolo, y = "Percentage of Response", x = "") +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "top"
    )
  p
}
# Response rate broken down by the three main demographic variables.
p1 <- plot_pct(df_clean, "education", "Response for Education Level")
p2 <- plot_pct(df_clean, "marital_status", "Response for Marital Status")
p3 <- plot_pct(df_clean, "children", "Response for Children")
print(p1)
print(p2)
print(p3)
Customers who respond to the latest offer have a median number of days
since their last purchase that is significantly lower than those who do
not respond to the latest campaign.
# Responders show markedly lower recency, i.e. more recent purchases.
df_clean %>%
  ggplot(aes(x = as.factor(response), y = recency, fill = as.factor(response))) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(values = c("0" = "#9575CD", "1" = "#F06292")) +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  labs(title = "Recency vs Response", x = "Response", y = "Recency") +
  theme_minimal() +
  theme(legend.position = "none")
As expected, total spending is higher among consumers who are more
likely to accept advertising campaigns.
# Total spending by response. Consistency fix: use the same purple/pink
# response palette as every other plot in the report (the original used
# an unrelated "violet"/"blue" pair here).
df_clean %>%
  ggplot(aes(x = factor(response), y = total_spent, fill = factor(response))) +
  geom_boxplot(alpha = 0.8, outlier.colour = "#FF5555", outlier.size = 2) +
  scale_fill_manual(values = c("0" = "#9575CD", "1" = "#F06292")) +
  labs(
    title = "Total spent for Response",
    x = "Response",
    y = "Total Spent"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "none"
  )
# 70/30 train/test split, stratified on the response factor via caret.
# The id column identifies customers but is not a predictor, so it is
# dropped from both sets.
df_logistic <- df_clean %>%
  mutate(response = as.factor(response))
set.seed(1926)
idx <- caret::createDataPartition(
  df_logistic$response,
  p = 0.7,
  list = FALSE
)
train <- df_logistic[idx, ] %>% dplyr::select(-id)
test <- df_logistic[-idx, ] %>% dplyr::select(-id)
Weights to compensate for the imbalanced response classes
# Per-observation weights for the imbalanced target: each class is
# weighted by the OTHER class's share of the training set, so the
# minority class (response == 1) receives the larger weight.
class_weights <- ifelse(
  train$response == 1,
  sum(train$response == 0) / nrow(train),
  sum(train$response == 1) / nrow(train)
)
Stepwise selection on the training set. The warning about non-integer successes is expected, given the use of continuous class weights, and does not affect model validity.
# Weighted logistic regression with all candidate predictors (the upper
# bound of the stepwise search).
# NOTE(review): suppressWarnings() is used to hide the expected
# "non-integer #successes" warning caused by the continuous class
# weights, but it will also hide any OTHER warning glm raises — consider
# catching that specific warning instead.
full_model <- suppressWarnings(glm(
  response ~ age + education + marital_status + income_log +
    children + recency + mnt_wines + mnt_fruits +
    mnt_meat_products + mnt_fish_products + mnt_sweet_products +
    mnt_gold_prods + num_deals_purchases + num_web_purchases +
    num_catalog_purchases + num_store_purchases +
    num_web_visits_month + accepted_any,
  data = train,
  family = binomial,
  weights = class_weights
))
# Intercept-only model: the lower bound of the stepwise search.
null_model <- suppressWarnings(glm(
  response ~ 1,
  data = train,
  family = binomial,
  weights = class_weights
))
# Bidirectional stepwise selection by AIC between null and full models.
best_model <- suppressWarnings(stepAIC(
  null_model,
  scope = list(lower = null_model, upper = full_model),
  direction = "both",
  trace = TRUE
))
## Start: AIC=323.62
## response ~ 1
##
## Df Deviance AIC
## + accepted_any 1 467.19 245.98
## + num_catalog_purchases 1 499.97 278.75
## + mnt_meat_products 1 505.82 284.60
## + mnt_wines 1 505.92 284.70
## + recency 1 516.11 294.89
## + num_web_purchases 1 523.28 302.06
## + children 1 523.99 302.77
## + income_log 1 528.90 307.68
## + marital_status 3 528.12 310.90
## + mnt_fruits 1 533.39 312.17
## + mnt_sweet_products 1 533.60 312.38
## + mnt_fish_products 1 535.53 314.31
## + mnt_gold_prods 1 535.97 314.75
## + num_store_purchases 1 544.40 323.18
## <none> 546.84 323.62
## + education 3 541.32 324.10
## + num_web_visits_month 1 546.68 325.46
## + num_deals_purchases 1 546.78 325.56
## + age 1 546.84 325.62
##
## Step: AIC=283.39
## response ~ accepted_any
##
## Df Deviance AIC
## + recency 1 437.19 255.38
## + mnt_meat_products 1 447.69 265.89
## + num_catalog_purchases 1 450.32 268.52
## + marital_status 3 448.14 270.33
## + num_web_purchases 1 458.94 277.13
## + mnt_wines 1 459.60 277.80
## + children 1 460.13 278.33
## + mnt_fruits 1 460.98 279.18
## + mnt_sweet_products 1 461.97 280.17
## + mnt_fish_products 1 463.57 281.76
## + num_deals_purchases 1 463.74 281.94
## + education 3 460.22 282.42
## + income_log 1 464.81 283.01
## + mnt_gold_prods 1 465.10 283.29
## <none> 467.19 283.39
## + num_web_visits_month 1 465.57 283.77
## + age 1 467.06 285.26
## + num_store_purchases 1 467.17 285.37
## - accepted_any 1 546.84 361.04
##
## Step: AIC=266.45
## response ~ accepted_any + recency
##
## Df Deviance AIC
## + num_catalog_purchases 1 417.32 248.58
## + mnt_meat_products 1 417.48 248.74
## + marital_status 3 417.48 252.74
## + mnt_wines 1 426.88 258.14
## + num_web_purchases 1 427.23 258.49
## + children 1 430.48 261.74
## + mnt_fruits 1 430.60 261.86
## + mnt_sweet_products 1 430.71 261.97
## + education 3 429.37 264.63
## + mnt_fish_products 1 433.43 264.69
## + num_deals_purchases 1 433.96 265.22
## + income_log 1 434.31 265.57
## + mnt_gold_prods 1 434.99 266.25
## <none> 437.19 266.45
## + num_web_visits_month 1 435.72 266.98
## + num_store_purchases 1 437.12 268.38
## + age 1 437.15 268.41
## - recency 1 467.19 294.45
## - accepted_any 1 516.11 343.37
##
## Step: AIC=255.3
## response ~ accepted_any + recency + num_catalog_purchases
##
## Df Deviance AIC
## + num_web_visits_month 1 398.50 238.47
## + marital_status 3 398.89 242.86
## + num_deals_purchases 1 411.78 251.76
## + num_store_purchases 1 412.30 252.28
## + mnt_meat_products 1 413.31 253.28
## + education 3 409.98 253.96
## + num_web_purchases 1 414.70 254.68
## + income_log 1 415.23 255.21
## <none> 417.32 255.30
## + age 1 416.43 256.40
## + mnt_wines 1 416.87 256.84
## + children 1 416.99 256.96
## + mnt_fish_products 1 417.00 256.98
## + mnt_fruits 1 417.17 257.14
## + mnt_sweet_products 1 417.27 257.25
## + mnt_gold_prods 1 417.28 257.25
## - num_catalog_purchases 1 437.19 273.16
## - recency 1 450.32 286.30
## - accepted_any 1 463.94 299.92
##
## Step: AIC=240.57
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month
##
## Df Deviance AIC
## + marital_status 3 379.27 227.34
## + mnt_meat_products 1 383.27 227.35
## + children 1 394.30 238.37
## + education 3 392.13 240.20
## <none> 398.50 240.57
## + mnt_fruits 1 396.52 240.59
## + num_store_purchases 1 396.63 240.70
## + mnt_sweet_products 1 397.07 241.15
## + age 1 397.50 241.57
## + num_deals_purchases 1 398.05 242.13
## + mnt_wines 1 398.09 242.16
## + mnt_fish_products 1 398.22 242.29
## + num_web_purchases 1 398.25 242.32
## + income_log 1 398.26 242.33
## + mnt_gold_prods 1 398.27 242.35
## - num_web_visits_month 1 417.32 257.40
## - recency 1 432.73 272.81
## - num_catalog_purchases 1 435.72 275.80
## - accepted_any 1 445.35 285.42
##
## Step: AIC=234.03
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month +
## marital_status
##
## Df Deviance AIC
## + mnt_meat_products 1 365.05 221.81
## + children 1 376.14 232.91
## + num_store_purchases 1 376.16 232.93
## + education 3 373.20 233.97
## <none> 379.27 234.03
## + mnt_fruits 1 377.79 234.55
## + age 1 378.22 234.99
## + mnt_sweet_products 1 378.49 235.26
## + num_deals_purchases 1 378.80 235.56
## + mnt_fish_products 1 378.85 235.62
## + mnt_wines 1 378.87 235.64
## + num_web_purchases 1 378.99 235.76
## + mnt_gold_prods 1 379.03 235.80
## + income_log 1 379.05 235.82
## - marital_status 3 398.50 247.26
## - num_web_visits_month 1 398.89 251.65
## - recency 1 413.53 266.29
## - num_catalog_purchases 1 415.68 268.45
## - accepted_any 1 426.38 279.15
##
## Step: AIC=225.45
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month +
## marital_status + mnt_meat_products
##
## Df Deviance AIC
## + num_store_purchases 1 360.11 222.51
## + education 3 357.63 224.04
## <none> 365.05 225.45
## + age 1 364.06 226.46
## + children 1 364.17 226.57
## + mnt_gold_prods 1 364.38 226.78
## + num_deals_purchases 1 364.39 226.79
## + income_log 1 364.80 227.21
## + mnt_fish_products 1 364.83 227.24
## + mnt_wines 1 364.94 227.34
## + mnt_fruits 1 365.00 227.40
## + mnt_sweet_products 1 365.01 227.41
## + num_web_purchases 1 365.02 227.43
## - num_catalog_purchases 1 377.11 235.51
## - mnt_meat_products 1 379.27 237.67
## - marital_status 3 383.27 237.68
## - num_web_visits_month 1 395.37 253.77
## - recency 1 398.79 257.20
## - accepted_any 1 411.48 269.88
##
## Step: AIC=225.32
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month +
## marital_status + mnt_meat_products + num_store_purchases
##
## Df Deviance AIC
## + education 3 352.27 223.48
## + num_deals_purchases 1 357.89 225.10
## <none> 360.11 225.32
## + children 1 359.04 226.25
## + num_web_purchases 1 359.15 226.36
## + age 1 359.44 226.65
## + mnt_gold_prods 1 359.77 226.99
## + mnt_wines 1 359.84 227.05
## + mnt_fruits 1 359.89 227.10
## + income_log 1 359.96 227.17
## + mnt_fish_products 1 360.00 227.21
## + mnt_sweet_products 1 360.09 227.30
## - num_store_purchases 1 365.05 228.26
## - num_catalog_purchases 1 375.64 238.85
## - marital_status 3 379.85 239.07
## - mnt_meat_products 1 376.16 239.38
## - num_web_visits_month 1 386.07 249.29
## - recency 1 393.35 256.57
## - accepted_any 1 404.76 267.98
##
## Step: AIC=226.72
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month +
## marital_status + mnt_meat_products + num_store_purchases +
## education
The strongest predictor is accepted_any1: if a customer has accepted offers in the past, they are much more likely to respond again.
The level of education affects the probability of response (it increases with the level of education). Marital status also has an impact: single people are more likely to respond. Recency matters too: as the number of days since the last purchase increases, the probability of response decreases.
The residual deviance of the model is much lower than the null deviance, so the selected variables explain the phenomenon well.
best_model
##
## Call: glm(formula = response ~ accepted_any + recency + num_catalog_purchases +
## num_web_visits_month + marital_status + mnt_meat_products +
## num_store_purchases + education, family = binomial, data = train,
## weights = class_weights)
##
## Coefficients:
## (Intercept) accepted_any1
## -2.209697 1.944504
## recency num_catalog_purchases
## -0.026595 0.243715
## num_web_visits_month marital_statusDivorced_Widow
## 0.364284 0.092536
## marital_statusMarried marital_statusTogether
## -1.124751 -1.145687
## mnt_meat_products num_store_purchases
## 0.003287 -0.115229
## educationUndergrad_Grad educationMaster
## 0.286017 0.580893
## educationPhD
## 1.105900
##
## Degrees of Freedom: 1547 Total (i.e. Null); 1535 Residual
## Null Deviance: 546.8
## Residual Deviance: 352.3 AIC: 226.7
# Predicted response probabilities for the held-out test set.
prob_test <- predict(best_model, test, type = "response")
The most reliable predictors appear to be accepted_any1, num_web_visits_month, recency, and the "Married" and "Together" levels of marital_status.
# Odds ratios with profile-likelihood 95% confidence intervals (exponentiated coefficients).
suppressWarnings(exp(cbind(OR = coef(best_model), confint(best_model))))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.1097339 0.01904037 0.5811542
## accepted_any1 6.9901646 3.89528609 12.9685251
## recency 0.9737560 0.96425081 0.9828336
## num_catalog_purchases 1.2759805 1.12140206 1.4637073
## num_web_visits_month 1.4394830 1.23896692 1.6919729
## marital_statusDivorced_Widow 1.0969526 0.50066277 2.4188736
## marital_statusMarried 0.3247335 0.16015785 0.6434069
## marital_statusTogether 0.3180054 0.14543035 0.6783637
## mnt_meat_products 1.0032920 1.00171513 1.0049890
## num_store_purchases 0.8911617 0.80535213 0.9826276
## educationUndergrad_Grad 1.3311152 0.47104501 3.8809111
## educationMaster 1.7876336 0.55188798 5.9663140
## educationPhD 3.0219425 1.00942382 9.4254582
num_catalog_purchases, mnt_meat_products, and num_web_visits_month: these have the highest values; people who buy a lot of meat often use the catalog and visit the website. Since the values are below 5, the model can still distinguish the impact of each.
education and marital_status: these have values close to 1.0, meaning that education level and marital status are nearly uncorrelated with the other predictors in the model, so their coefficients are not inflated by multicollinearity.
# Generalized VIFs (car::vif); GVIF^(1/(2*Df)) is comparable across terms with
# different degrees of freedom. Values well below 5 indicate no multicollinearity concern.
vif(best_model)
## GVIF Df GVIF^(1/(2*Df))
## accepted_any 1.142110 1 1.068695
## recency 1.091630 1 1.044811
## num_catalog_purchases 2.320847 1 1.523433
## num_web_visits_month 2.092538 1 1.446561
## marital_status 1.087119 3 1.014019
## mnt_meat_products 2.171771 1 1.473693
## num_store_purchases 1.553058 1 1.246217
## education 1.073621 3 1.011910
# ROC curve on the test set; direction = "<" means higher scores indicate the case class.
roc_obj <- roc(
  predictor = prob_test,
  response = as.numeric(as.character(test$response)),
  direction = "<"
)
## Setting levels: control = 0, case = 1
# Area under the ROC curve on the test set.
auc_value <- auc(roc_obj)
If you take a random customer who responded and one who did not respond, the model has an 85.7% chance of assigning a higher score to the one who actually responded. The curve rises rapidly toward the upper left corner, indicating that you can achieve high sensitivity without sacrificing too much specificity (non-responders are rarely misclassified).
# Draw the ROC curve with a diagonal chance line, then locate the Youden-optimal
# operating point (maximises sensitivity + specificity - 1).
plot(
  roc_obj,
  main = paste("ROC Curve - Logistic Model (AUC =", round(auc_value, 3), ")"),
  col = "skyblue",
  lwd = 2
)
abline(a = 0, b = 1, col = "gray", lty = 2)
opt <- coords(
  roc_obj,
  x = "best",
  ret = c("threshold", "sensitivity", "specificity"),
  best.method = "youden"
)
opt
## threshold sensitivity specificity
## 1 0.4565964 0.7878788 0.7960993
Classification with the best (Youden-optimal) threshold
# Extract the Youden-optimal threshold as a plain numeric in a single step.
# (The original assigned `threshold` twice; the first assignment was dead code.)
threshold <- as.numeric(opt["threshold"])
# Hard classification of the test set at the optimal threshold.
pred_class <- ifelse(prob_test >= threshold, 1, 0)
pred_class <- factor(pred_class, levels = c(0, 1))
# Sanity check: predictions and labels have matching lengths.
length(pred_class)
## [1] 663
length(test$response)
## [1] 663
Specificity (the true negative rate) shows that the model identifies about 80% of those who will not respond. Sensitivity is 0.788, indicating that it identifies about 79% of interested customers.
There are a total of 21 false negatives (customers who would have responded yes), and the model tends to make the less serious error (115 false positives).
Confusion Matrix
# Confusion matrix at the optimal threshold; "1" (responder) is the positive class.
cm <- confusionMatrix(
pred_class,
test$response,
positive = "1"
)
# Reshape the table for a ggplot heatmap.
cm_df <- as.data.frame(cm$table)
colnames(cm_df) <- c("Predicted", "Actual", "Freq")
ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 6) +
scale_fill_gradient(low = "#F7FBFF", high = "skyblue") +
labs(
title = "Confusion Matrix – Logistic Regression",
x = "Actual Class",
y = "Predicted Class",
fill = "Count"
) +
theme_minimal(base_size = 14)
Metrics
# Confusion matrix at the optimal threshold ("1" = responder is the positive class).
cm <- confusionMatrix(pred_class, test$response, positive = "1")
# Pull each metric once and reuse it below. (Previously these variables were
# assigned and then never used, because the tibble re-extracted every value
# from cm$byClass directly.)
precision <- cm$byClass["Precision"]
recall <- cm$byClass["Recall"]
f1 <- cm$byClass["F1"]
specificity <- cm$byClass["Specificity"]
balanced_acc <- cm$byClass["Balanced Accuracy"]
# One-row summary table for later model comparison.
metrics_logistic <- tibble::tibble(
  Model = c("Logistica"),
  AUC = as.numeric(auc_value),
  Precision = as.numeric(precision),
  Recall = as.numeric(recall),
  F1 = as.numeric(f1),
  Specificity = as.numeric(specificity),
  Balanced_Accuracy = as.numeric(balanced_acc)
)
metrics_logistic
## # A tibble: 1 × 7
## Model AUC Precision Recall F1 Specificity Balanced_Accuracy
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Logistica 0.857 0.404 0.788 0.534 0.796 0.792
Class imbalance was addressed using the weights calculated previously for the logistic regression. Elements of the minority class are assigned a weight equal to the relative frequency of the majority class, and vice versa. This forces the models to give equal overall importance to observations from both classes.
# Ensure categorical columns are factors in train, and give test the SAME levels
# (unseen test-set categories become NA instead of new levels).
cat_cols <- names(train)[sapply(train, function(x) is.factor(x) || is.character(x))]
cat_cols <- setdiff(cat_cols, "response")
for (col in cat_cols) {
train[[col]] <- as.factor(train[[col]])
test[[col]] <- factor(test[[col]], levels = levels(train[[col]]))
}
# Design matrices without the intercept column.
X_train <- model.matrix(response ~ ., data = train)[, -1]
y_train <- train$response
# NOTE(review): NA rows are dropped from test only; presumably train is already
# complete -- confirm upstream.
test_clean <- na.omit(test)
X_test <- model.matrix(response ~ ., data = test_clean)[, -1]
y_test <- test_clean$response
# Standardise using training means/SDs only, to avoid test-set leakage.
X_train_scaled <- scale(X_train)
train_means <- attr(X_train_scaled, "scaled:center")
train_sds <- attr(X_train_scaled, "scaled:scale")
X_test_scaled <- scale(X_test, center = train_means, scale = train_sds)
# Drop the identifier column if present in the design matrix.
X_train_scaled <- X_train_scaled[, colnames(X_train_scaled) != "id"]
X_test_scaled <- X_test_scaled[, colnames(X_test_scaled) != "id"]
set.seed(1926)
# Cross-validated Lasso (alpha = 1), selected by AUC, with the class weights
# computed earlier; standardize = FALSE because the matrix is already scaled.
lasso_model <- suppressWarnings(cv.glmnet(
x = X_train_scaled,
y = y_train,
family = "binomial",
alpha = 1,
weights = class_weights,
type.measure = "auc",
standardize = FALSE
))
# Test-set probabilities at the CV-optimal lambda.
prob_lasso <- predict(lasso_model, X_test_scaled, type = "response", s = "lambda.min")
The most predictive variables in this case are also accepted_any1, num_web_visits_month, mnt_meat_products, and num_catalog_purchases. Recency, being in a relationship, and a tendency to buy in store are associated with a lower response to the latest advertising campaign.
A slight negative effect is also given by the type of education, the number of children, and age.
# Coefficients retained by the Lasso at lambda.min (non-zero => selected).
lasso_coefs <- coef(lasso_model, s = "lambda.min")
lasso_coefs_df <- as.data.frame(as.matrix(lasso_coefs))
colnames(lasso_coefs_df) <- "Coefficiente"
selected_features <- lasso_coefs_df[lasso_coefs_df$Coefficiente != 0, , drop = FALSE]
print(selected_features)
## Coefficiente
## (Intercept) -0.80505974
## educationMaster 0.07414495
## educationPhD 0.31827324
## marital_statusDivorced_Widow 0.03894996
## marital_statusMarried -0.45274937
## marital_statusTogether -0.40690552
## recency -0.69423248
## mnt_fruits 0.05899273
## mnt_meat_products 0.56346952
## mnt_sweet_products 0.03120537
## num_deals_purchases 0.29460892
## num_web_purchases 0.02534825
## num_catalog_purchases 0.54774314
## num_store_purchases -0.41233530
## num_web_visits_month 0.67618126
## children -0.29690081
## age -0.08559063
## accepted_any1 0.74755015
set.seed(1926)
# Cross-validated Ridge (alpha = 0) with the same weights and AUC criterion as the Lasso.
ridge_model <- suppressWarnings(cv.glmnet(
x = X_train_scaled,
y = y_train,
family = "binomial",
alpha = 0,
weights = class_weights,
type.measure = "auc",
standardize = FALSE
))
# Test-set probabilities at lambda.min.
prob_ridge <- predict(ridge_model, X_test_scaled, type = "response", s = "lambda.min")
# Ridge shrinks coefficients toward (but not exactly to) zero, so the
# "non-zero" filter keeps essentially every feature.
ridge_coefs <- coef(ridge_model, s = "lambda.min")
ridge_coefs_df <- as.data.frame(as.matrix(ridge_coefs))
colnames(ridge_coefs_df) <- "Coefficiente"
selected_features <- ridge_coefs_df[ridge_coefs_df$Coefficiente != 0, , drop = FALSE]
print(selected_features)
## Coefficiente
## (Intercept) -0.697028903
## educationUndergrad_Grad -0.009367024
## educationMaster 0.073622350
## educationPhD 0.272118515
## marital_statusDivorced_Widow 0.090013229
## marital_statusMarried -0.347226518
## marital_statusTogether -0.318800317
## recency -0.595037221
## mnt_wines 0.007763906
## mnt_fruits 0.076548042
## mnt_meat_products 0.387010667
## mnt_fish_products -0.038078655
## mnt_sweet_products 0.056292145
## mnt_gold_prods -0.028271049
## num_deals_purchases 0.257222909
## num_web_purchases 0.087385594
## num_catalog_purchases 0.398843285
## num_store_purchases -0.386560393
## num_web_visits_month 0.484419372
## children -0.246603893
## total_spent 0.148510899
## age -0.109763078
## income_log 0.010034507
## accepted_any1 0.638172342
Ridge performs almost identically to the Lasso, even though the Lasso shrinks the coefficients of the less influential variables exactly to 0 while Ridge only shrinks them toward 0. Since the two models perform the same, the variables dropped by the Lasso evidently do not add information beyond noise.
# ROC curves and AUC for Lasso and Ridge on the test set.
roc_lasso <- roc(y_test, as.numeric(prob_lasso))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_ridge <- roc(y_test, as.numeric(prob_ridge))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_lasso <- auc(roc_lasso)
auc_ridge <- auc(roc_ridge)
cat("AUC Lasso:", round(auc_lasso, 3), "\n")
## AUC Lasso: 0.863
cat("AUC Ridge:", round(auc_ridge, 3), "\n")
## AUC Ridge: 0.86
# Long-format data frame so both curves can share one plot.
roc_lasso_df <- data.frame(
  False_Positive_Rate = 1 - roc_lasso$specificities,
  True_Positive_Rate = roc_lasso$sensitivities,
  Model = "Lasso"
)
roc_ridge_df <- data.frame(
  False_Positive_Rate = 1 - roc_ridge$specificities,
  True_Positive_Rate = roc_ridge$sensitivities,
  Model = "Ridge"
)
roc_df <- rbind(roc_lasso_df, roc_ridge_df)
# `linewidth` replaces the `size` aesthetic for lines (deprecated since
# ggplot2 3.4.0); this silences the warning the original chunk emitted.
ggplot(roc_df, aes(x = False_Positive_Rate, y = True_Positive_Rate, color = Model)) +
  geom_line(linewidth = 1.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - Lasso vs Ridge Regression",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal(base_size = 14) +
  scale_color_manual(values = c("Lasso" = "blue", "Ridge" = "skyblue")) +
  theme(legend.position = "bottom")
# Youden-optimal thresholds for each penalised model, then hard class
# predictions and confusion matrices on the test set.
opt_lasso <- coords(roc_lasso, x = "best", best.method = "youden", ret = c("threshold"))
opt_ridge <- coords(roc_ridge, x = "best", best.method = "youden", ret = c("threshold"))
threshold_lasso <- as.numeric(opt_lasso)
threshold_ridge <- as.numeric(opt_ridge)
pred_lasso_class <- factor(ifelse(prob_lasso >= threshold_lasso, 1, 0), levels = c(0, 1))
pred_ridge_class <- factor(ifelse(prob_ridge >= threshold_ridge, 1, 0), levels = c(0, 1))
cm_lasso <- confusionMatrix(pred_lasso_class, y_test, positive = "1")
cm_ridge <- confusionMatrix(pred_ridge_class, y_test, positive = "1")
# Heatmap of a caret confusion matrix with counts printed in each cell.
plot_cm <- function(cm, title) {
  freq_table <- as.data.frame(cm$table)
  names(freq_table) <- c("Predicted", "Actual", "Freq")
  ggplot(freq_table, aes(x = Actual, y = Predicted, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), size = 6) +
    scale_fill_gradient(low = "#F7FBFF", high = "blue") +
    theme_minimal(base_size = 14) +
    labs(title = title, x = "Actual Class", y = "Predicted Class", fill = "Count")
}
Both models have a higher number of false positives than false negatives.
# Confusion-matrix heatmaps for both penalised models.
plot_cm(cm_lasso, "Confusion Matrix – Lasso")
plot_cm(cm_ridge, "Confusion Matrix – Ridge")
# Two-row summary table (one row per penalised model) for later comparison.
metrics_lasso_ridge <- tibble(
Model = c("Lasso", "Ridge"),
AUC = c(as.numeric(auc_lasso), as.numeric(auc_ridge)),
Precision = c(cm_lasso$byClass["Precision"], cm_ridge$byClass["Precision"]),
Recall = c(cm_lasso$byClass["Recall"], cm_ridge$byClass["Recall"]),
F1 = c(cm_lasso$byClass["F1"], cm_ridge$byClass["F1"]),
Specificity = c(cm_lasso$byClass["Specificity"], cm_ridge$byClass["Specificity"]),
Balanced_Accuracy = c(cm_lasso$byClass["Balanced Accuracy"], cm_ridge$byClass["Balanced Accuracy"])
)
metrics_lasso_ridge
## # A tibble: 2 × 7
## Model AUC Precision Recall F1 Specificity Balanced_Accuracy
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lasso 0.863 0.384 0.869 0.533 0.755 0.812
## 2 Ridge 0.860 0.373 0.859 0.520 0.746 0.803
# Top-20 Ridge coefficients (at lambda.min) by absolute magnitude.
coef_matrix <- coef(ridge_model, s = "lambda.min") %>% as.matrix()
coef_df <- data.frame(
Feature = rownames(coef_matrix),
Coefficient = coef_matrix[, 1]
)
coef_df <- coef_df %>%
filter(Feature != "(Intercept)") %>%
arrange(desc(abs(Coefficient)))
top_20_features <- coef_df %>%
slice_max(order_by = abs(Coefficient), n = 20)
# Horizontal bar chart, colour-coded by coefficient sign.
ggplot(top_20_features, aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
geom_col(aes(fill = Coefficient > 0)) +
coord_flip() +
labs(
title = "Top Predictors Ridge",
x = "Variables",
y = "Coefficients"
) +
theme_minimal() +
scale_fill_manual(
values = c("TRUE" = "blue", "FALSE" = "skyblue"),
labels = c("TRUE" = "Positivo", "FALSE" = "Negativo"),
name = "Influenza"
)
# Non-zero Lasso coefficients (at lambda.min) ranked by absolute magnitude.
coef_lasso_matrix <- coef(lasso_model, s = "lambda.min") %>% as.matrix()
lasso_df <- data.frame(
Feature = rownames(coef_lasso_matrix),
Coefficient = coef_lasso_matrix[, 1]
)
lasso_clean <- lasso_df %>%
filter(Feature != "(Intercept)") %>%
filter(Coefficient != 0) %>%
arrange(desc(abs(Coefficient)))
# Report how many features the Lasso retained out of the full set.
cat("Il modello Lasso ha selezionato", nrow(lasso_clean), "variabili su", nrow(lasso_df) - 1, "\n")
## Il modello Lasso ha selezionato 17 variabili su 23
plot_data <- lasso_clean %>%
slice_max(order_by = abs(Coefficient), n = 20)
# Horizontal bar chart, colour-coded by coefficient sign.
ggplot(plot_data, aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
geom_col(aes(fill = Coefficient > 0)) +
coord_flip() +
labs(
title = "Top Predictors Lasso",
x = "Variables",
y = "Coefficients"
) +
theme_minimal() +
scale_fill_manual(
values = c("TRUE" = "blue", "FALSE" = "skyblue"),
labels = c("TRUE" = "Positivo", "FALSE" = "Negativo"),
name = "Influenza"
)
# GAM specification: smooth terms s() for the continuous spending/behaviour
# variables, parametric terms for categorical and count predictors.
gam_formula <- as.formula(
response ~ s(age) + s(income_log) + s(recency) +
s(mnt_wines) + s(mnt_fruits) + s(mnt_meat_products) +
s(mnt_fish_products) + s(mnt_sweet_products) + s(mnt_gold_prods) +
education + marital_status + children +
num_deals_purchases + num_web_purchases + num_catalog_purchases +
num_store_purchases + num_web_visits_month + accepted_any
)
# NOTE(review): this overwrites `test` globally by dropping NA rows; earlier
# chunks created `test_clean` for the same purpose -- confirm they stay consistent.
test <- na.omit(test)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-4. For overview type '?mgcv'.
# Weighted binomial GAM fitted on the training set.
gam_model <- suppressWarnings(gam(
gam_formula,
data = train,
family = binomial,
weights = class_weights
))
# Test-set response probabilities from the fitted GAM.
prob_gam <- predict(gam_model, test, type = "response")
Variables such as age, recency, mnt_fruits, mnt_fish_products, and mnt_sweet_products have an EDF of 1.000: although the model has the freedom to fit curves, it has nevertheless chosen straight lines for these variables. The nonlinear relationships found concern income and meat products, which increase the probability of response up to a certain point. Gold products also show a curvature, but it is not statistically significant (see the p-value).
# Partial-effect plots for every smooth term (all on one page), then the model summary.
plot(gam_model, pages = 1, shade = TRUE, seWithMean = TRUE)
summary(gam_model)
##
## Family: binomial
## Link function: logit
##
## Formula:
## response ~ s(age) + s(income_log) + s(recency) + s(mnt_wines) +
## s(mnt_fruits) + s(mnt_meat_products) + s(mnt_fish_products) +
## s(mnt_sweet_products) + s(mnt_gold_prods) + education + marital_status +
## children + num_deals_purchases + num_web_purchases + num_catalog_purchases +
## num_store_purchases + num_web_visits_month + accepted_any
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.51122 1.07933 -3.253 0.001141 **
## educationUndergrad_Grad 0.28403 0.55747 0.509 0.610402
## educationMaster 0.62676 0.65171 0.962 0.336189
## educationPhD 1.32950 0.63563 2.092 0.036473 *
## marital_statusDivorced_Widow 0.28438 0.43744 0.650 0.515631
## marital_statusMarried -1.05726 0.38150 -2.771 0.005583 **
## marital_statusTogether -1.08111 0.42301 -2.556 0.010595 *
## children -0.65194 0.32277 -2.020 0.043402 *
## num_deals_purchases 0.26111 0.10788 2.420 0.015508 *
## num_web_purchases 0.03116 0.08793 0.354 0.723076
## num_catalog_purchases 0.26119 0.08480 3.080 0.002069 **
## num_store_purchases -0.15982 0.06795 -2.352 0.018661 *
## num_web_visits_month 0.39970 0.11495 3.477 0.000507 ***
## accepted_any1 1.94302 0.36463 5.329 9.89e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(age) 1.000 1.000 0.179 0.6722
## s(income_log) 5.352 6.561 13.094 0.0653 .
## s(recency) 1.001 1.003 30.288 <2e-16 ***
## s(mnt_wines) 1.963 2.510 2.121 0.4534
## s(mnt_fruits) 1.000 1.000 0.491 0.4834
## s(mnt_meat_products) 4.364 5.461 11.948 0.0369 *
## s(mnt_fish_products) 1.000 1.000 0.593 0.4415
## s(mnt_sweet_products) 1.000 1.001 0.026 0.8717
## s(mnt_gold_prods) 3.960 4.840 5.027 0.4837
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.462 Deviance explained = 44.3%
## UBRE = -0.75865 Scale est. = 1 n = 1548
# ROC analysis for the GAM on the test set.
roc_gam <- roc(y_test, as.numeric(prob_gam))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_gam <- auc(roc_gam)
# Youden-optimal operating point and confusion matrix.
opt_gam <- coords(roc_gam, x = "best", best.method = "youden", ret = "threshold")
threshold_gam <- as.numeric(opt_gam)
pred_gam_class <- factor(ifelse(prob_gam >= threshold_gam, 1, 0), levels = c(0, 1))
cm_gam <- confusionMatrix(pred_gam_class, y_test, positive = "1")
# Data frame for the ROC plot below.
roc_gam_df <- data.frame(
  False_Positive_Rate = 1 - roc_gam$specificities,
  True_Positive_Rate = roc_gam$sensitivities
)
# Fix: the original recomputed roc_gam a second time and then executed
# `auc_lasso <- auc(roc_gam)`, silently overwriting the Lasso AUC with the GAM
# AUC. The duplicate roc() call and the clobbering assignment are removed.
The ROC curve appears more solid, with the same AUC area as previous models.
# ROC curve for the GAM. `linewidth` replaces the `size` aesthetic for lines
# (deprecated since ggplot2 3.4.0).
ggplot(roc_gam_df, aes(x = False_Positive_Rate, y = True_Positive_Rate)) +
  geom_line(color = "red", linewidth = 1.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - GAM",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal(base_size = 14)
# Fix: opt_gam, threshold_gam, pred_gam_class and cm_gam were recomputed here
# with exactly the same calls as just above; the duplicates are removed.
The GAM model has succeeded in improving the ability to detect True Negatives compared to Ridge and Lasso, and is also better at identifying False Positives.
# Confusion-matrix heatmap and summary metrics for the GAM.
cm_df <- as.data.frame(cm_gam$table)
colnames(cm_df) <- c("Predicted", "Actual", "Freq")
ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 6) +
scale_fill_gradient(low = "#F7FBFF", high = "#D62728") +
labs(
title = "Confusion Matrix – GAM Logistic Regression",
x = "Actual Class",
y = "Predicted Class",
fill = "Count"
) +
theme_minimal(base_size = 14)
# One-row summary table for later model comparison.
metrics_gam <- tibble(
Model = "GAM",
AUC = as.numeric(auc_gam),
Precision = cm_gam$byClass["Precision"],
Recall = cm_gam$byClass["Recall"],
F1 = cm_gam$byClass["F1"],
Specificity = cm_gam$byClass["Specificity"],
Balanced_Accuracy = cm_gam$byClass["Balanced Accuracy"]
)
print(metrics_gam)
## # A tibble: 1 × 7
## Model AUC Precision Recall F1 Specificity Balanced_Accuracy
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 GAM 0.869 0.491 0.788 0.605 0.856 0.822
s(recency) shows an almost perfectly linear decline: each additional day since the last purchase reduces the probability of success.
# Build a rough "importance" chart for the GAM by combining the test statistics
# of the smooth terms (s.table) and the parametric terms (p.table).
gam_summary <- summary(gam_model)
if (!is.null(gam_summary$s.table)) {
smooth_table <- as.data.frame(gam_summary$s.table)
# The statistic column is "Chi.sq" for binomial fits and "F" otherwise.
stat_col_smooth <- grep("Chi\\.sq|F", colnames(smooth_table), value = TRUE)[1]
smooth_data <- data.frame(
Feature = rownames(smooth_table),
Statistic = smooth_table[[stat_col_smooth]],
Type = "Smooth"
)
} else {
smooth_data <- data.frame()
}
p_table <- gam_summary$p.table
if (!is.null(p_table) && nrow(p_table) > 0) {
p_df <- as.data.frame(p_table)
# Find the z/t statistic column; fall back to the 3rd column if unnamed.
stat_col_linear <- grep("value", colnames(p_df), value = TRUE)[1]
stats_values <- if (!is.na(stat_col_linear)) p_df[[stat_col_linear]] else p_table[, 3]
linear_data <- data.frame(
Feature = rownames(p_df),
# Squaring the z statistic makes it comparable to the chi-square
# statistics of the smooth terms (z^2 ~ chi-square with 1 df).
Statistic = stats_values^2,
Type = "Parametric (Linear)"
)
linear_data <- linear_data %>% filter(Feature != "(Intercept)")
} else {
linear_data <- data.frame()
}
gam_importance <- bind_rows(smooth_data, linear_data)
if (nrow(gam_importance) > 0) {
gam_importance <- gam_importance %>%
arrange(desc(Statistic)) %>%
slice_max(order_by = Statistic, n = 20)
ggplot(gam_importance, aes(x = reorder(Feature, Statistic), y = Statistic, fill = Type)) +
geom_col() +
coord_flip() +
labs(
title = "GAM",
subtitle = "Statistica del test (Chi-quadro approx)",
x = "Variables",
y = "Importance (Test Statistic)",
fill = "Tipo di Relazione"
) +
theme_minimal() +
scale_fill_manual(values = c("Smooth" = "red", "Parametric (Linear)" = "blue"))
} else {
print("Nessuna variabile significativa trovata o modello vuoto.")
}
set.seed(1926)
# Random forest; mtry = floor(sqrt(p)) is the usual classification default.
# NOTE(review): unlike the other models, no class weights or sampling
# correction is applied here -- confirm this is intentional.
rf_model <- randomForest(
response ~ .,
data = train,
ntree = 500,
mtry = floor(sqrt(ncol(train) - 1)),
importance = TRUE
)
# Probability of the positive class (second column of the probability matrix).
prob_rf <- predict(rf_model, test, type = "prob")[, 2]
Random Forests has high levels of both specificity and sensitivity.
# ROC curve, optimal threshold and confusion matrix for the random forest.
roc_rf <- roc(y_test, prob_rf)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_rf <- auc(roc_rf)
cat("AUC Random Forest:", round(auc_rf, 3), "\n")
## AUC Random Forest: 0.873
roc_rf_df <- data.frame(
  False_Positive_Rate = 1 - roc_rf$specificities,
  True_Positive_Rate = roc_rf$sensitivities
)
# `linewidth` replaces the `size` aesthetic for lines (deprecated since ggplot2 3.4.0).
ggplot(roc_rf_df, aes(x = False_Positive_Rate, y = True_Positive_Rate)) +
  geom_line(color = "darkgreen", linewidth = 1.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - Random Forest",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal(base_size = 14)
# Youden-optimal threshold and the resulting confusion matrix on the test set.
opt_rf <- coords(roc_rf, x = "best", best.method = "youden", ret = c("threshold"))
threshold_rf <- as.numeric(opt_rf)
pred_rf_class <- factor(ifelse(prob_rf >= threshold_rf, 1, 0), levels = c(0, 1))
cm_rf <- confusionMatrix(pred_rf_class, y_test, positive = "1")
The model correctly identifies the vast majority of those who will not respond, reducing false positives to 94. It loses only 24 customers (false negatives).
# Confusion-matrix heatmap for the random-forest classifier.
cm_rf_df <- as.data.frame(cm_rf$table)
names(cm_rf_df) <- c("Predicted", "Actual", "Freq")
ggplot(cm_rf_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 6) +
  scale_fill_gradient(low = "#F7FBFF", high = "darkgreen") +
  theme_minimal(base_size = 14) +
  labs(
    title = "Confusion Matrix – Random Forest",
    x = "Actual Class",
    y = "Predicted Class",
    fill = "Count"
  )
# One-row summary table for later model comparison.
metrics_rf <- tibble(
Model = "Random Forest",
AUC = as.numeric(auc_rf),
Precision = cm_rf$byClass["Precision"],
Recall = cm_rf$byClass["Recall"],
F1 = cm_rf$byClass["F1"],
Specificity = cm_rf$byClass["Specificity"],
Balanced_Accuracy = cm_rf$byClass["Balanced Accuracy"]
)
print(metrics_rf)
## # A tibble: 1 × 7
## Model AUC Precision Recall F1 Specificity Balanced_Accuracy
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Random Forest 0.873 0.444 0.758 0.560 0.833 0.795
Recency continues to be the best predictor, followed by income and total spent.
# Variable importance from the random forest. With importance = TRUE the last
# column of importance() is MeanDecreaseGini -- confirm that is the metric intended.
imp_matrix <- importance(rf_model)
rf_imp_df <- data.frame(
Feature = rownames(imp_matrix),
Importance = imp_matrix[, ncol(imp_matrix)]
)
# Top 20 features, largest first.
rf_clean <- rf_imp_df %>%
arrange(desc(Importance)) %>%
slice_max(order_by = Importance, n = 20)
ggplot(rf_clean, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(
title = "Feature Importance - Random Forest",
x = "Variables",
y = "Importance"
) +
theme_minimal()
# Drop the identifier column before one-hot encoding for XGBoost.
df_logistic <- df_logistic%>%
dplyr::select(-id)
# One-hot encode every predictor; response is encoded separately as 0/1.
dummies <- caret::dummyVars(~., data = df_logistic %>% dplyr::select(-response))
X_xgb_full <- predict(dummies, newdata = df_logistic)
y_xgb_full <- as.numeric(df_logistic$response) - 1
# Reuse the train/test split indices (`idx`) created earlier in the document.
X_train_xgb <- X_xgb_full[idx, ]
y_train_xgb <- y_xgb_full[idx]
X_test_xgb <- X_xgb_full[-idx, ]
y_test_xgb <- y_xgb_full[-idx]
dtrain <- xgb.DMatrix(data = X_train_xgb, label = y_train_xgb)
dtest <- xgb.DMatrix(data = X_test_xgb, label = y_test_xgb)
set.seed(1926)
# Conservative learning rate with row/column subsampling for regularisation.
params <- list(
objective = "binary:logistic",
eval_metric = "auc",
eta = 0.05,
max_depth = 6,
subsample = 0.8,
colsample_bytree = 0.8
)
# Handle class imbalance via scale_pos_weight = (#negatives / #positives).
scale_pos_weight <- sum(y_train_xgb == 0) / sum(y_train_xgb == 1)
params$scale_pos_weight <- scale_pos_weight
# Train with early stopping on test-set AUC (stops after 50 stagnant rounds).
xgb_model <- suppressWarnings(xgb.train(
params = params,
data = dtrain,
nrounds = 500,
watchlist = list(train = dtrain, test = dtest),
print_every_n = 50,
early_stopping_rounds = 50,
verbose = 0
))
# Test-set probabilities from the best iteration.
prob_xgb <- predict(xgb_model, dtest)
The ROC curve is closest to the upper left corner.
# ROC curve and optimal threshold for XGBoost.
# NOTE(review): y_test comes from the earlier glmnet split (test_clean), while
# prob_xgb comes from the -idx split -- the knitted output shows they align,
# but confirm both splits drop the same rows.
roc_xgb <- roc(y_test, prob_xgb)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_xgb <- auc(roc_xgb)
cat("AUC XGBoost:", round(auc_xgb, 3), "\n")
## AUC XGBoost: 0.888
roc_xgb_df <- data.frame(
  False_Positive_Rate = 1 - roc_xgb$specificities,
  True_Positive_Rate = roc_xgb$sensitivities
)
# `linewidth` replaces the `size` aesthetic for lines (deprecated since ggplot2 3.4.0).
ggplot(roc_xgb_df, aes(x = False_Positive_Rate, y = True_Positive_Rate)) +
  geom_line(color = "blue", linewidth = 1.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - XGBoost",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal(base_size = 14)
# Youden-optimal classification threshold.
opt_xgb <- coords(roc_xgb, x = "best", best.method = "youden", ret = c("threshold"))
threshold_xgb <- as.numeric(opt_xgb)
It identifies 83% of respondents (true positives) and has the lowest number of false negatives among the models tested.
# Hard classification at the optimal threshold and the resulting confusion matrix.
pred_xgb_class <- factor(ifelse(prob_xgb >= threshold_xgb, 1, 0), levels = c(0, 1))
cm_xgb <- confusionMatrix(pred_xgb_class, y_test, positive = "1")
cm_xgb_df <- as.data.frame(cm_xgb$table)
colnames(cm_xgb_df) <- c("Predicted", "Actual", "Freq")
ggplot(cm_xgb_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 6) +
scale_fill_gradient(low = "#F7FBFF", high = "blue") +
labs(
title = "Confusion Matrix – XGBoost",
x = "Actual Class",
y = "Predicted Class",
fill = "Count"
) +
theme_minimal(base_size = 14)
# One-row summary table for later model comparison.
metrics_xgb <- tibble(
Model = "XGBoost",
AUC = as.numeric(auc_xgb),
Precision = cm_xgb$byClass["Precision"],
Recall = cm_xgb$byClass["Recall"],
F1 = cm_xgb$byClass["F1"],
Specificity = cm_xgb$byClass["Specificity"],
Balanced_Accuracy = cm_xgb$byClass["Balanced Accuracy"]
)
print(metrics_xgb)
## # A tibble: 1 × 7
## Model AUC Precision Recall F1 Specificity Balanced_Accuracy
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 XGBoost 0.888 0.474 0.828 0.603 0.839 0.833
XGBoost ranks variables based on Gain (the incremental contribution of each variable to the improvement of the model). It confirmed that Meat and Wines, together with Recency, are the “golden triangle” for predicting your customers’ behavior.
# Gain-based feature importance from the fitted booster.
importance_matrix <- xgb.importance(model = xgb_model)
plot_data <- importance_matrix %>%
  as.data.frame() %>%
  # slice_max() replaces the superseded top_n(); ties are kept either way, and
  # the subsequent reorder() fixes the display order regardless of row order.
  slice_max(order_by = Gain, n = 20) %>%
  mutate(Feature = reorder(Feature, Gain))
ggplot(plot_data, aes(x = Feature, y = Gain, fill = Gain)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_viridis_c(option = "magma") +
  geom_text(aes(label = round(Gain, 3)),
    hjust = -0.2, size = 3.5, color = "black") +
  theme_minimal() +
  labs(
    title = "Feature Importance (XGBoost)",
    x = NULL,
    y = "Importance (Gain)"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.y = element_text(size = 11, face = "bold")
  )
In order to test the robustness and generalization ability of the models used, the data is split into 5 folds and the models are trained and evaluated on each fold.
XGBoost has the highest average AUC, but Random Forest is more consistent because it has a lower standard deviation (it performs roughly the same across the 5 folds). Logistic regression has an average very similar to XGBoost.
# 5-fold cross-validation comparing logistic regression, random forest and
# XGBoost by held-out-fold AUC. The model-fitting order is unchanged from the
# original, so the RNG sequence (and therefore the results) is preserved.
set.seed(1926)
k <- 5
folds <- sample(rep(1:k, length.out = nrow(train)))
# Preallocate one slot per fold instead of growing cv_results with rbind()
# inside the loop (O(n^2) copying).
fold_results <- vector("list", k)
for (i in 1:k) {
  test_idx <- which(folds == i)
  cv_train <- train[-test_idx, ]
  cv_test <- train[test_idx, ]
  # Inverse-frequency class weights: each class is weighted by the other
  # class's relative frequency in the training fold.
  w_pos <- sum(cv_train$response == 0) / nrow(cv_train)
  w_neg <- sum(cv_train$response == 1) / nrow(cv_train)
  cv_weights <- ifelse(cv_train$response == 1, w_pos, w_neg)
  # Weighted logistic regression.
  m_log <- suppressWarnings(glm(response ~ ., data = cv_train, family = binomial, weights = cv_weights))
  p_log <- predict(m_log, cv_test, type = "response")
  auc_log <- auc(roc(cv_test$response, p_log, quiet = TRUE))
  # Random forest (smaller forest than the final model, for speed).
  m_rf <- suppressWarnings(randomForest(response ~ ., data = cv_train, ntree = 100))
  p_rf <- predict(m_rf, cv_test, type = "prob")[, 2]
  auc_rf <- auc(roc(cv_test$response, p_rf, quiet = TRUE))
  # XGBoost: one-hot encoder fitted on the training fold only (no leakage).
  dummies_cv <- caret::dummyVars(~., data = cv_train %>% dplyr::select(-response))
  X_cv_train <- predict(dummies_cv, newdata = cv_train)
  y_cv_train <- as.numeric(cv_train$response) - 1
  dtrain_fold <- xgb.DMatrix(data = X_cv_train, label = y_cv_train)
  X_cv_test <- predict(dummies_cv, newdata = cv_test)
  y_cv_test <- as.numeric(cv_test$response) - 1
  dtest_fold <- xgb.DMatrix(data = X_cv_test, label = y_cv_test)
  scale_pos <- sum(y_cv_train == 0) / sum(y_cv_train == 1)
  params_cv <- list(
    objective = "binary:logistic",
    eval_metric = "auc",
    eta = 0.1,
    max_depth = 4,
    scale_pos_weight = scale_pos
  )
  m_xgb <- suppressWarnings(xgb.train(params = params_cv, data = dtrain_fold, nrounds = 100, verbose = 0))
  p_xgb <- predict(m_xgb, dtest_fold)
  # NOTE(review): auc_rf / auc_xgb shadow the earlier global test-set AUCs;
  # those were already consumed above, but renaming would be safer.
  auc_xgb <- auc(roc(cv_test$response, p_xgb, quiet = TRUE))
  fold_results[[i]] <- data.frame(
    Model = c("Logistic", "Random Forest", "XGBoost"),
    Fold = i,
    AUC = c(as.numeric(auc_log), as.numeric(auc_rf), as.numeric(auc_xgb))
  )
}
cv_results <- do.call(rbind, fold_results)
# Mean and spread of AUC per model across the folds.
cv_summary <- cv_results %>%
  group_by(Model) %>%
  summarise(
    Mean_AUC = mean(AUC),
    SD_AUC = sd(AUC),
    Min_AUC = min(AUC),
    Max_AUC = max(AUC)
  )
print(cv_summary)
## # A tibble: 3 × 5
## Model Mean_AUC SD_AUC Min_AUC Max_AUC
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Logistic 0.871 0.0189 0.847 0.895
## 2 Random Forest 0.862 0.0161 0.840 0.882
## 3 XGBoost 0.874 0.0316 0.841 0.912
XGBoost is the best performing model in terms of pure predictive power, while GAM and Random Forest offer the best balance between accuracy and the ability to correctly identify “no” responses. If the goal is to avoid losing any potential customers at all costs, Lasso would be the best choice (recall). GAM is the most conservative and balanced model with the highest precision and F1 score.
All models converge on the same theory, with Recency, Accepted Any, and Meat Products driving the situation.
# Harmonise the Model labels, then stack all per-model metric tables and sort by AUC.
metrics_logistic <- metrics_logistic %>% mutate(Model = "Stepwise Logistic")
metrics_lasso_ridge <- metrics_lasso_ridge %>% mutate(Model = c("Lasso", "Ridge"))
metrics_rf <- metrics_rf %>% mutate(Model = "Random Forest")
metrics_gam <- metrics_gam %>% mutate(Model = "GAM")
metrics_xgb <- metrics_xgb %>% mutate(Model = "XGBoost")
all_metrics_sorted <- bind_rows(
metrics_logistic,
metrics_lasso_ridge,
metrics_rf,
metrics_gam,
metrics_xgb
) %>%
dplyr::select(Model, everything()) %>%
arrange(desc(AUC))
# Plain data frame without row names for a clean printed comparison table.
all_metrics_final <- as.data.frame(all_metrics_sorted)
rownames(all_metrics_final) <- NULL
print(all_metrics_final)
## Model AUC Precision Recall F1 Specificity
## 1 XGBoost 0.8879665 0.4739884 0.8282828 0.6029412 0.8386525
## 2 Random Forest 0.8725106 0.4437870 0.7575758 0.5597015 0.8333333
## 3 GAM 0.8688033 0.4905660 0.7878788 0.6046512 0.8563830
## 4 Lasso 0.8632692 0.3839286 0.8686869 0.5325077 0.7553191
## 5 Ridge 0.8598127 0.3728070 0.8585859 0.5198777 0.7464539
## 6 Stepwise Logistic 0.8572695 0.4041451 0.7878788 0.5342466 0.7960993
## Balanced_Accuracy
## 1 0.8334677
## 2 0.7954545
## 3 0.8221309
## 4 0.8120030
## 5 0.8025199
## 6 0.7919890
The predictive powers of the Recency and Accepted_any variables and meat products are confirmed.
# SHAP values for the fitted booster. Despite the argument name X_train, the
# TEST-set matrix is passed here, so the plot explains test-set predictions.
shap_long <- shap.prep(xgb_model = xgb_model, X_train = X_test_xgb)
shap.plot.summary(shap_long) +
ggtitle("SHAP Summary Plot") +
theme_minimal()
The red curve shows an almost perfect decline. Below 25 days of Recency,
the impact is strongly positive (SHAP > 1). Above 50 days, the impact
becomes consistently negative. The color of the dots indicates spending
on meat.
# SHAP dependence for recency; color_feature = "auto" colours points by the
# feature with the strongest estimated interaction.
shap.plot.dependence(data_long = shap_long, x = "recency",
color_feature = "auto") +
ggtitle("SHAP Dependence: Recency")
## `geom_smooth()` using formula = 'y ~ x'
The curve reaches a plateau around €1,000 and then begins a slight
downward parabola.
# SHAP dependence for meat spending, with interaction-based colouring.
shap.plot.dependence(data_long = shap_long, x = "mnt_meat_products",
color_feature = "auto") +
ggtitle("SHAP Dependence: Meat Products")
## `geom_smooth()` using formula = 'y ~ x'